/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package sk.lieskove.jianghongtiao.websearch.document.preprocessing.actions;

import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import sk.lieskove.jianghongtiao.common.extract.text.MSOffice2Text;
import sk.lieskove.jianghongtiao.websearch.service.ExcludeReasonEnum;
import sk.lieskove.jianghongtiao.websearch.persistence.WebSearchResponse;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;

/**
 * 
 * @author xjuraj e-mail: jjurco.sk_gmail.com
 */
public class PoiTextExtractor implements Serializable, PreprocessAction {

    private transient Logger log = Logger.getLogger(PoiTextExtractor.class);

    @Override
    public WebSearchResponse preprocess(WebSearchResponse response) {
        if (response.getFile() == null) {
            return response;
        }
        try {
            MSOffice2Text office2Text = new MSOffice2Text();
            
            String extractedText = office2Text.extractText(response.getFile());
            
            if ("".equals(extractedText)) {
                response.setExcludeReasonEnum(ExcludeReasonEnum.POI_NOT_SUPPORTED);
                return response;
            }
            //save extracted text to file
            File result = sk.lieskove.jianghongtiao.common.utils.FileUtils.getTempFile("POI_", "");
            FileUtils.writeStringToFile(result, extractedText, "UTF-8");
            response.setFile(result);
        } catch (IOException ex) {
            log.error("I/O exception the file: " + response.getFile().getAbsolutePath());
        }
        return response;
    }

}
